package com.sky.orangehireserver.utils;

import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
import org.springframework.stereotype.Service;
import org.xml.sax.SAXException;

import java.io.IOException;
import java.io.InputStream;
import java.util.Objects;
import java.util.logging.Logger;

/**
 * Spring-managed helper that extracts plain text from document streams
 * (e.g. uploaded resumes) using Apache Tika.
 */
@Service
public class ResumeParseUtil {
    private static final Logger LOG = Logger.getLogger(ResumeParseUtil.class.getName());

    private final Tika tika = new Tika();

    /**
     * Built once and reused by {@link #parseWithMetadata(InputStream)} so the default Tika
     * configuration is not reloaded on every call.
     * NOTE(review): relies on Tika's documented expectation that parsers are thread-safe.
     */
    private final AutoDetectParser parser = new AutoDetectParser();

    /**
     * Extracts the text content of a document via the {@link Tika} facade.
     *
     * @param is stream positioned at the start of the document; must not be {@code null}
     * @return the text extracted by {@code Tika.parseToString}
     * @throws NullPointerException if {@code is} is {@code null}
     * @throws IOException          if the stream cannot be read
     * @throws TikaException        if the document cannot be parsed
     */
    public String parseToString(InputStream is) throws IOException, TikaException {
        Objects.requireNonNull(is, "is must not be null");
        return tika.parseToString(is);
    }

    /**
     * Advanced variant: parses the document with an {@link AutoDetectParser}, collecting
     * metadata alongside the body text. The metadata is only logged (title and content type);
     * the return value is the body text alone.
     *
     * @param is stream positioned at the start of the document; must not be {@code null}
     * @return the body text gathered by the content handler
     * @throws NullPointerException if {@code is} is {@code null}
     * @throws IOException          if the stream cannot be read
     * @throws TikaException        if the document cannot be parsed
     * @throws SAXException         if the content handler fails while receiving parse events
     */
    public String parseWithMetadata(InputStream is) throws IOException, TikaException, SAXException {
        Objects.requireNonNull(is, "is must not be null");

        // -1 removes the handler's size limit.
        // NOTE(review): with untrusted uploads an unbounded buffer can exhaust memory;
        // consider a finite limit if callers can tolerate truncation/limit errors.
        BodyContentHandler handler = new BodyContentHandler(-1);
        Metadata metadata = new Metadata();
        parser.parse(is, handler, metadata, new ParseContext());

        // Author, title and similar metadata are available here. They are logged at FINE
        // through the logging framework instead of being printed to stdout unconditionally.
        // NOTE(review): newer Tika releases may expose the title under "dc:title" rather
        // than "title" - confirm the key against the Tika version in use.
        LOG.fine(() -> "Title: " + metadata.get("title"));
        LOG.fine(() -> "Content-Type: " + metadata.get("Content-Type"));

        return handler.toString();
    }
}
